#### CODE FOR TABLE 2 ####
library(randomForest)
library(tidyverse)
library(ggpubr)

# set working directory
setwd("data/analysis")

# import pooled set
all_data <- read.csv("pooled_data.csv")

# converting variables to factors
all_data$pid7_F <- factor(all_data$PartyID7)
all_data$ATTEND_F <- factor(all_data$ATTEND)
all_data$INCOME_F <- factor(all_data$INCOME)
all_data$EDUC_F <- factor(all_data$EDUC)
all_data$EMPLOY_F <- factor(all_data$EMPLOY)
all_data$HOUSING_F <- factor(all_data$HOUSING)

# drop missing data
all_data <- filter(all_data, !is.na(TNRFU), !is.na(pid7_F), !is.na(ATTEND_F))

# drop duplicates on these covariates
all_data <- all_data |> distinct(TNRFU, GENDER2, EDUC_F, EMPLOY_F, HOME_TYPE5, 
                               INCOME_F, STATE, MARITAL6, INTERNET, PHONESERVICE5, 
                               ATTEND_F, METRO, pid7_F, HOUSING_F, HHSIZE, RACE, 
                     .keep_all = T)

# set random seed
set.seed(78712)

# vector for subsetting training set (1/10 sample)
test <- sample(nrow(all_data), nrow(all_data)/10)

# learn random forest model predicting reluctant respondent
rf.nrfu <- randomForest(factor(TNRFU) ~ AGE + GENDER2 + RACE + EDUC_F + 
                              MARITAL6 + EMPLOY_F + INCOME_F + METRO + INTERNET + 
                              pid7_F + ATTEND_F + HOUSING_F + HOME_TYPE5 + 
                              PHONESERVICE5 + HHSIZE + STATE, 
                        data = all_data, subset = -test, mtry = 4, ntree = 600,
                        importance = T)

##### ROWS 1, 2, and 4 (OUT-OF-BAG) of TABLE 2 #####
rf.nrfu
oob.error <- .1523

##### ROW 5 (TEST SET) of TABLE 2 #####
# calculating classification rate on test data set
yhat <- predict(rf.nrfu, newdata = all_data[test, ])
yhat <- ifelse(yhat == 0, 0, 1)
nrfu.test <- all_data[test, "TNRFU"]
1 - mean(yhat == nrfu.test)
test_set.error <- 1 - mean(yhat == nrfu.test)

##### ROW 6 (AUROC) of TABLE 2 #####
rf_p_train <- as.numeric(predict(rf.nrfu, newdata = all_data[test, ], type="resp"))
rf_pr_train <- ROCR::prediction(rf_p_train, nrfu.test)
r_auc_train1 <- ROCR::performance(rf_pr_train, measure = "auc")@y.values[[1]] 
r_auc_train1

##### SAMPLE PROPORTIONS #####
prop.table(table(all_data$NRFU))

table2 <- data.frame(rf.nrfu$confusion) # confusion matrix

# add sample proportions of eager and reluctant respondents
table2 <- add_column(table2, sample_proportions = prop.table(table(all_data$NRFU)))

# part 1 of table 2, first two rows
table2 |> 
      xtable(digits = 3) |>
      print(include.rownames = FALSE) |>
      write_file(file = "../../results/Table_2_pt1.tex")

# part 2 of table 2, Out-of-bag error rate
data.frame(oob.error = oob.error) |>
      xtable(digits = 3) |>
      print(include.rownames = FALSE) |>
      write_file(file = "../../results/Table_2_pt2.tex")

# part 3 of table 2, test-set error rate
data.frame(test_set.error = test_set.error) |>
      xtable(digits = 3) |>
      print(include.rownames = FALSE) |>
      write_file(file = "../../results/Table_2_pt3.tex")

# part 4 of table 2, AUROC
data.frame(r_auc_train1 = r_auc_train1) |>
      xtable(digits = 3) |>
      print(include.rownames = FALSE) |>
      write_file(file = "../../results/Table_2_pt4.tex")


#### APPENDIX FIGURE H5 ####
# PLOTTING VARIABLE IMPORTANCE
# obtain importance statistics
imp_plot_data <- data.frame(rf.nrfu$importance)

# obtain SD of mean decrease accuracy
MDAsd <- rf.nrfu$importanceSD[,3]

# create mean decrease accuracy column that matches MeanDecreaseAccuracy in varImpPlot call
imp_plot_data$MeanDecAcc <- imp_plot_data$MeanDecreaseAccuracy / MDAsd

# create variable name column
imp_plot_data$variable <- rownames(imp_plot_data)
rownames(imp_plot_data) <- NULL
imp_plot_data$variable <- c("Age", "Gender", "Race", "Education", "Marital Status",
                            "Employment Status", "Income", "Metro Residence", "Internet Access",
                            "Party ID", "Religious Attendance", "Home Ownership", "Type of Building",
                            "Telephone Service", "Size of Household", "State")


plot_acc <- imp_plot_data %>%
      ggplot(aes(MeanDecAcc, reorder(variable, MeanDecAcc, sum))) +
      geom_point() + 
      ylab("Variable") + 
      xlab("Mean Decrease Accuracy") + 
      theme_bw() + 
      theme(text=element_text(size=16))

plot_gini <- imp_plot_data %>%
      ggplot(aes(MeanDecreaseGini, reorder(variable, MeanDecreaseGini, sum))) +
      geom_point() +
      ylab("") +
      xlab("Mean Decrease Gini") +
      theme_bw() + 
      theme(text=element_text(size=16))

var_imp_plot <- ggarrange(plot_acc, plot_gini)

# save
var_imp_plot
ggsave("../../results/Figure_H5.pdf")
